{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "import pandas as pd\n",
    "import datetime as dt\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "from dateutil.relativedelta import relativedelta\n",
    "import matplotlib.pyplot as plt\n",
    "import sys\n",
    "from dateutil.relativedelta import relativedelta\n",
    "from datetime import datetime, timedelta\n",
    "sys.path.append(\"../\")\n",
    "import tools\n",
    "\n",
    "pl.Config.set_fmt_str_lengths(100);\n",
    "pl.enable_string_cache(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths\n",
    "path_output = '' # curated data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advan indeed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the data, classified and non-classified\n",
    "advan_indeed = pl.read_parquet(path_output+ 'df_matches_all_naics.parquet')\n",
    "advan_indeed_nm = pl.read_parquet(path_output+'indeed_all_jobs.parquet')\n",
    "advan_indeed = advan_indeed.sort('date_first_visible')\n",
    "advan_indeed_nm = advan_indeed_nm.sort('date_first_visible')\n",
    "print('Number of matched observations: ', advan_indeed.shape)\n",
    "print('Number of total observations: ', advan_indeed_nm.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Last date of the data\n",
    "print('Maximum date matched: ',advan_indeed['date_first_visible'].max())\n",
    "print('Maximum date total: ',advan_indeed_nm['date_first_visible'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# put the NAICS column into number format\n",
    "advan_indeed = (\n",
    "    advan_indeed.with_columns(pl.col('modal_naics').cast(pl.Int64)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename columns\n",
    "advan_indeed = advan_indeed[['date_first_visible', 'modal_naics', 'province', 'cleaned_name', 'last_date']].rename({'date_first_visible' :'date', 'modal_naics': 'naics_indeed'})\n",
    "advan_indeed_nm = advan_indeed_nm[['date_first_visible', 'province', 'company_name', 'last_date']].rename({'date_first_visible' :'date'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(advan_indeed.describe())\n",
    "print(advan_indeed_nm.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# transform the NAICS into a 2-digit code\n",
    "advan_indeed = advan_indeed.with_columns(pl.col('naics_indeed').cast(pl.Utf8).str.slice(0, 2))\n",
    "print(advan_indeed['naics_indeed'].value_counts().sort('naics_indeed').to_pandas())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a lit of unique NAICS codes for the loop\n",
    "naics = advan_indeed['naics_indeed'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stocks by NAICS\n",
    "ts_nat = []\n",
    "\n",
    "for code in naics:\n",
    "    df_temp = (\n",
    "        advan_indeed\n",
    "        .filter(pl.col('naics_indeed') == f'{code}')\n",
    "    )\n",
    "    ts_n = (\n",
    "        tools.create_time_series(df_temp, first_date_col='date', last_date_col = 'last_date')\n",
    "        .rename({'vacancies': f'{code}'})\n",
    "        )\n",
    "    ts_nat.append(ts_n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stocks of the total\n",
    "ts_nat_nm = []\n",
    "\n",
    "ts_n_nm = (\n",
    "        tools.create_time_series(advan_indeed_nm, first_date_col='date', last_date_col = 'last_date')\n",
    "        .rename({'vacancies': f'total'})\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect the data in a datframe\n",
    "ts_new = ts_nat[0]\n",
    "for other_ts in ts_nat[1:]:\n",
    "    ts_new = ts_new.join(other_ts, how='outer', on='date')\n",
    "ts_new = ts_new.join(ts_n_nm.select(['date', 'total']), how='outer', on='date')\n",
    "ts_new = ts_new.sort('date')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Functions to put the data monthly -- Note that this takes the mean of the month\n",
    "# group\n",
    "def resample (df, new_df, freq):\n",
    "    for col in df.columns[1:]:\n",
    "        temp = df.groupby_dynamic(index_column=\"date\", every=freq).agg(pl.col(f'{col}').mean())\n",
    "        new_df.append(temp)\n",
    "# construct the dataset        \n",
    "def ts_resampled (df):\n",
    "    # monthly stocks of job postings\n",
    "    final_df = df[0]\n",
    "    for other_ts in df[1:]:\n",
    "        final_df = final_df.join(other_ts, how='outer', on='date')\n",
    "    return final_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create monthly data\n",
    "resampled_nat_m = []\n",
    "resample(ts_new, resampled_nat_m, '1mo')\n",
    "ts_nat_m = ts_resampled(resampled_nat_m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# renaming columns\n",
    "for col in ts_nat_m.columns[1:]:\n",
    "    ts_nat_m = ts_nat_m.rename({f'{col}': f'{\"indeed_\"+ col}'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_nat_m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Renaming and grouping the NAICS to match JVWS\n",
    "ts_nat_m = ts_nat_m.rename({'indeed_42': 'indeed_41', 'indeed_92': 'indeed_91'})\n",
    "ts_nat_m = ts_nat_m.with_columns (\n",
    "    (pl.col('indeed_31')+pl.col('indeed_32')+pl.col('indeed_33')).alias('indeed_31')\n",
    ").drop(['indeed_32', 'indeed_33'])\n",
    "ts_nat_m = ts_nat_m.with_columns (\n",
    "    (pl.col('indeed_44')+pl.col('indeed_45')).alias('indeed_44')\n",
    ").drop(['indeed_45'])\n",
    "ts_nat_m = ts_nat_m.with_columns (\n",
    "    (pl.col('indeed_48')+pl.col('indeed_49')).alias('indeed_48')\n",
    ").drop(['indeed_49'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts_nat_m.write_csv(path_output+'indeed_naics_moavg.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env_indeed2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
